Exploring Carnatic Performance#
Thomas Nuttall, Genís Plaja-Roglans, Lara Pearson, Brindha Manickavasakan, Xavier Serra.
This notebook serves to demonstrate the wide range of tools available as part of the compIAM package. We demonstrate their use on a single performance from the Saraga Audiovisual Dataset, the multi-modal portion of the wider Saraga Dataset [1, 2]. The tools showcased here are not accompanied by exhaustive usage documentation; that documentation can be found on their respective pages in other parts of this web book, links to which are provided in each section.
1. Import Dependencies and Data#
Due to restrictions in accessing the Saraga API through the GitHub-hosted web book, we access the data through a custom shared Google Drive folder created specifically for this tutorial. Users wishing to work with audio from Saraga should follow the instructions here.
## Installing (if not) and importing compiam to the project
import importlib.util  # used below to test whether optional backends are already installed

%pip install -U compiam==0.4.1 # Install the pinned version of compiam

# Install the optional ML backends only when they are missing.
# NOTE(review): the version pins presumably match what compiam 0.4.1
# supports — confirm against the compiam release notes.
if importlib.util.find_spec('essentia') is None:
    %pip install essentia
if importlib.util.find_spec('torch') is None:
    %pip install "torch==1.13"
if importlib.util.find_spec('tensorflow') is None:
    %pip install "tensorflow==2.15.0" "keras<3"
import compiam
import essentia.standard as estd
# Import extras and suppress warnings to keep the tutorial clean
import os
import shutil
import gdown
import zipfile
import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')
# Download the tutorial audio bundle from the shared Google Drive and unpack it.
AUDIO_PATH = os.path.join("..", "audio", "demos")
# Ensure the destination folder exists — gdown fails if the target
# directory is missing.
os.makedirs(AUDIO_PATH, exist_ok=True)

url = "https://drive.google.com/uc?id=1iR0bfxDLQbH8fEeHU_GFsg2kh7brZ0HZ&export=download"
output = os.path.join(AUDIO_PATH, "dr-brindha-manickavasakan.zip")
gdown.download(url, output, quiet=False)

# Unzip file
with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall(AUDIO_PATH)

# Delete zip file after extraction
os.remove(output)
2. Loading and visualising the data#
We work with a single performance from a concert by Brindha Manickavasakan at the Arkay Convention Center, recorded in 2023 in Chennai, South India. The composition is Bhavanuta by Tyaagaraaja in raga mohanam.
# Select the rendition to analyse and resolve its folder inside the
# downloaded concert directory.
rendition = "Bhavanuta"
concert_folder = "dr-brindha-manickavasakan"
folder_path = os.path.join(AUDIO_PATH, concert_folder, rendition)
For hundreds of performances in the Saraga dataset, the audio stems corresponding to each instrument/performer are available. In this performance, these comprise the lead vocal, the mridangam (left and right microphone), the violin, and the tanpura. The full mix of all instruments is also available.
Let us select the preview versions of the multitrack audio: shortened, compressed versions of the rendition that are easier to handle for visualisation.
# Build the paths to the preview (shortened, compressed) version of each stem.
preview_dir = os.path.join(folder_path, "preview")
audio_path_pre = os.path.join(preview_dir, f"{rendition}.mp3")
mrid_left_path_pre = os.path.join(preview_dir, f"{rendition}.mridangam-left.mp3")
mrid_right_path_pre = os.path.join(preview_dir, f"{rendition}.mridangam-right.mp3")
violin_path_pre = os.path.join(preview_dir, f"{rendition}.multitrack-violin.mp3")
vocal_path_pre = os.path.join(preview_dir, f"{rendition}.multitrack-vocal.mp3")
tanpura_path_pre = os.path.join(preview_dir, f"{rendition}.tanpura.mp3")
2.1 Multitrack player#
We can use the compIAM waveform player to visualise and listen to all of the tracks at the same time, panning, or changing the volume of each as required.
from compiam.visualisation.waveform_player import Player

# Pair each stem label with its preview path, then split the pairs into
# the two parallel lists the player expects.
stems = [
    ("Vocal", vocal_path_pre),
    ("Violin", violin_path_pre),
    ("Mridangam left", mrid_left_path_pre),
    ("Mridangam right", mrid_right_path_pre),
    ("Tanpura", tanpura_path_pre),
]
all_names = [label for label, _ in stems]
all_audio_paths = [path for _, path in stems]
Player(all_names, all_audio_paths)
/opt/hostedtoolcache/Python/3.11.10/x64/lib/python3.11/site-packages/compiam/visualisation/waveform_player/waveform-playlist/
multi-channel.html
2.2 Video and Gesture Tracks#
The Saraga Audiovisual dataset includes videos of the performances and gesture tracks extracted using MMPose for the lead performer [3]. Let’s take a look at a sample for this performance.
# Render a 20-second excerpt of the performance video with the singer's
# MMPose keypoints and skeleton drawn on every frame.
import cv2
import IPython.display as ipd
from IPython.core.display import HTML

vid_out_path = f'{folder_path}/output_segment.mp4'

# Load keypoints and per-keypoint confidence scores
keypoints_file = f"{folder_path}/singer/Brindha_Manickavasakan_Segment1_0-513_kpts.npy"
scores_file = f"{folder_path}/singer/Brindha_Manickavasakan_Segment1_0-513_scores.npy"
video_file = f"{folder_path}/{rendition}.mov"  # Replace with your video file

keypoints = np.load(keypoints_file)
scores = np.load(scores_file)

# Skeleton connectivity for the body subset of the 135-keypoint MMPose layout.
skeleton = [
    (0, 1), (1, 2),      # Eyes (left to right)
    (0, 3), (0, 4),      # Nose to ears (left and right)
    (5, 6),              # Shoulders (left and right)
    (5, 7), (7, 9),      # Left arm (shoulder -> elbow -> wrist)
    (6, 8), (8, 10),     # Right arm (shoulder -> elbow -> wrist)
    (11, 12),            # Hips (left to right)
    (5, 11), (6, 12),    # Shoulders to hips
    (11, 13), (13, 15),  # Left leg (hip -> knee -> ankle)
    (12, 14), (14, 16),  # Right leg (hip -> knee -> ankle)
]

# Open video file and fail loudly if it cannot be read.
cap = cv2.VideoCapture(video_file)
if not cap.isOpened():
    raise IOError(f"Could not open video file: {video_file}")
fps = int(cap.get(cv2.CAP_PROP_FPS))  # Frames per second
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define start and end frames for the 20-second segment
start_time = 10             # Start time in seconds (adjust as needed)
end_time = start_time + 20  # End time in seconds
start_frame = int(start_time * fps)
end_frame = int(end_time * fps)

# Output video writer (same geometry and frame rate as the input)
out = cv2.VideoWriter(vid_out_path, cv2.VideoWriter_fourcc(*'mp4v'), fps,
                      (frame_width, frame_height))

CONF_THRESHOLD = 0.5  # Minimum keypoint confidence to draw (adjust as needed)

# Process the selected frames; release resources even if drawing fails.
try:
    frame_idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if start_frame <= frame_idx < end_frame:
            # Overlay the pose only while keypoint data is available.
            if frame_idx < len(keypoints):
                frame_keypoints = keypoints[frame_idx]
                frame_scores = scores[frame_idx]
                # Draw keypoints whose confidence clears the threshold.
                for i, (x, y) in enumerate(frame_keypoints):
                    if frame_scores[i] > CONF_THRESHOLD:
                        cv2.circle(frame, (int(x), int(y)), 5, (0, 255, 0), -1)
                # Draw skeleton edges whose endpoints are both confident.
                for start, end in skeleton:
                    if frame_scores[start] > CONF_THRESHOLD and frame_scores[end] > CONF_THRESHOLD:
                        x1, y1 = frame_keypoints[start]
                        x2, y2 = frame_keypoints[end]
                        cv2.line(frame, (int(x1), int(y1)), (int(x2), int(y2)),
                                 (255, 0, 0), 2)
            # Write (possibly annotated) frame to the output video.
            out.write(frame)
        frame_idx += 1
        # Stop processing once the segment is complete.
        if frame_idx >= end_frame:
            break
finally:
    # Release resources
    cap.release()
    out.release()
    cv2.destroyAllWindows()

print("20-second video segment processing complete. Output saved as 'output_segment.mp4'")
20-second video segment processing complete. Output saved as 'output_segment.mp4'
# Embed the annotated 20-second segment directly in the notebook.
# (A raw-HTML <video> player was considered previously; `Video` with
# embed=True is simpler and self-contained, so the dead code is removed.)
from IPython.core.display import Video

Video(vid_out_path, embed=True)